More information about the project & data can be found in the GitHub repository (see the repository README for the link).
# Data exploration
head(order_prior, 12) # Details of all the purchased products in a given order for each order_id
## order_id product_id add_to_cart_order reordered
## 1 2 33120 1 1
## 2 2 28985 2 1
## 3 2 9327 3 0
## 4 2 45918 4 1
## 5 2 30035 5 0
## 6 2 17794 6 1
## 7 2 40141 7 1
## 8 2 1819 8 1
## 9 2 43668 9 0
## 10 3 33754 1 1
## 11 3 24838 2 1
## 12 3 17704 3 1
head(order_train, 12) # Subset of training data in same format as order_prior
## order_id product_id add_to_cart_order reordered
## 1 1 49302 1 1
## 2 1 11109 2 1
## 3 1 10246 3 0
## 4 1 49683 4 0
## 5 1 43633 5 1
## 6 1 13176 6 0
## 7 1 47209 7 0
## 8 1 22035 8 1
## 9 36 39612 1 0
## 10 36 19660 2 1
## 11 36 49235 3 0
## 12 36 43086 4 1
head(orders, 12) # History of all orders for user and the associated order_id
## order_id user_id eval_set order_number order_dow order_hour_of_day
## 1 2539329 1 prior 1 2 8
## 2 2398795 1 prior 2 3 7
## 3 473747 1 prior 3 3 12
## 4 2254736 1 prior 4 4 7
## 5 431534 1 prior 5 4 15
## 6 3367565 1 prior 6 2 7
## 7 550135 1 prior 7 1 9
## 8 3108588 1 prior 8 1 14
## 9 2295261 1 prior 9 1 16
## 10 2550362 1 prior 10 4 8
## 11 1187899 1 train 11 4 8
## 12 2168274 2 prior 1 2 11
## days_since_prior_order
## 1 NA
## 2 15
## 3 21
## 4 29
## 5 28
## 6 19
## 7 20
## 8 14
## 9 0
## 10 30
## 11 14
## 12 NA
head(products) # Names of all the products
## product_id product_name
## 1 1 Chocolate Sandwich Cookies
## 2 2 All-Seasons Salt
## 3 3 Robust Golden Unsweetened Oolong Tea
## 4 4 Smart Ones Classic Favorites Mini Rigatoni With Vodka Cream Sauce
## 5 5 Green Chile Anytime Sauce
## 6 6 Dry Nose Oil
## aisle_id department_id
## 1 61 19
## 2 104 13
## 3 94 7
## 4 38 1
## 5 5 13
## 6 11 11
head(aisles) # Names of all the aisles
## aisle_id aisle
## 1 1 prepared soups salads
## 2 2 specialty cheeses
## 3 3 energy granola bars
## 4 4 instant foods
## 5 5 marinades meat preparation
## 6 6 other
head(depts) # Names of all the departments
## department_id department
## 1 1 frozen
## 2 2 other
## 3 3 bakery
## 4 4 produce
## 5 5 alcohol
## 6 6 international
# Response variable: reordered (1/0)
# We want to predict whether a given product is reordered or not by a customer
### put info and distribution of response here, its relationship w/ other variables
# NOTE(review): removed `par(mfrow = c(2, 2))` -- par() only affects base
# graphics and has no effect on ggplot2 output. Use the patchwork or gridExtra
# packages to arrange ggplots in a grid.
# Distribution of orders by hour of day (0-23)
ggplot(orders, aes(x = order_hour_of_day, fill = as.factor(order_hour_of_day))) +
  geom_histogram(bins = 24) +
  labs(title = "Order Frequency by Hour",
       x = "Time of Day") +
  theme_Publication() +
  theme(legend.position = "none")
# See how day of week variable is coded (numeric)
class(orders$order_dow)
## [1] "integer"
# Changing DOW variable to be a factor variable with character labels
# Distribution of orders by day of the week.
# order_dow is coded 0-6; levels are given explicitly so the 0 -> Saturday
# mapping does not depend on which codes happen to appear in the data (this
# matches the recoding used later when building full_order_data).
orders %>%
  mutate(order_dow = factor(order_dow,
                            levels = 0:6,
                            labels = c("Saturday", "Sunday", "Monday",
                                       "Tuesday", "Wednesday", "Thursday",
                                       "Friday"))) %>%
  ggplot(., aes(x = order_dow, fill = order_dow)) +  # order_dow is already a factor
  geom_bar(width = 0.75) +                           # removed stray trailing comma
  labs(title = "Order Frequency by Day",
       x = " Day of the Week") +
  theme_Publication() +
  scale_fill_Publication() +
  theme(legend.position = "none")
##
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
## Warning: The `scale_name` argument of `discrete_scale()` is deprecated as of ggplot2
## 3.5.0.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Distribution of order frequency
ggplot(orders, aes(x = days_since_prior_order, fill = as.factor(days_since_prior_order))) +
geom_histogram(bins = 30) +
labs(title = "Frequency of Customer Ordering",
x = " Days since Prior Order") +
theme_Publication() +
theme(legend.position = "none") # We see most reorders happen bw/ 1-8 days. Interesting pattern -- we see a spike in reordering every 7 days. Finally, we see the largest single spike at 30 days -- not sure as if I can interpret this as just 30 days or maybe 30+ days.
## Warning: Removed 206209 rows containing non-finite outside the scale range
## (`stat_bin()`).
# How many NA values are in our "days_since_prior_order" column
table(is.na(orders$days_since_prior_order)) # 206209
##
## FALSE TRUE
## 3214874 206209
# Most ordered products?
## subset of product data
# Top 20 most frequently ordered products, with product names attached.
product_freq_table <- order_prior %>%
  count(product_id, sort = TRUE) %>%              # per-product order counts, largest first
  inner_join(., products, by = "product_id") %>%  # bring in product names
  select(product_id, n, product_name) %>%         # drop aisle_id / department_id
  slice_head(n = 20)                              # keep only the 20 most ordered
## plot of most frequently ordered products
# Bar chart of the top-20 products, ordered by descending order count.
ggplot(product_freq_table, aes(x = reorder(product_name, -n), y = n, fill = as.factor(product_name))) +
geom_bar(stat = "identity") +
labs(title = "Product Order Frequency",
x = "Name of Product",
y = "Total orders") +
theme_Publication() +
# NOTE(review): the mapped aesthetic above is `fill`, so scale_colour_Publication()
# has no visible effect here -- scale_fill_Publication() was probably intended.
# Left unchanged pending confirmation.
scale_colour_Publication() +
theme(legend.position = "none",
axis.text.x = element_text(angle = 45, hjust = 1))
## plot of most frequently ordered products, by aisle
product_freq_table %>%
left_join(products, by = "product_name") %>%
select(n, department_id, aisle_id) %>%
left_join(depts, by = "department_id") %>%
left_join(aisles, by = "aisle_id") %>%
select(n, department, aisle) %>%
ggplot(., aes(x = reorder(aisle, -n), y = n, fill = as.factor(aisle))) +
geom_bar(stat = "identity") +
labs(title = "Product Order Frequency",
x = "Name of Aisle",
y = "Total orders") +
theme_Publication() +
scale_colour_Publication() +
theme(legend.position = "none",
axis.text.x = element_text(angle = 45, hjust = 1))
# Create a full data set of all orders with all information on each order -- order_id, user_id, all products in that order, the associated aisle & department, etc.
# Build the row-level analysis table: one row per (order, product), with user,
# timing, and product metadata attached.
full_order_data <- orders %>%
# full_join keeps every order; orders also present in order_train get its
# product columns (suffixed .x after the next join).
full_join(., order_train, by = "order_id") %>%
# inner_join duplicates the product columns (train = .x, prior = .y) and
# restricts the result to orders that appear in order_prior.
inner_join(., order_prior, by = "order_id") %>%
arrange(user_id, order_number) %>%
# Keep the prior-side (.y) values for the product columns.
# NOTE(review): the train-side (.x) values are discarded entirely here --
# confirm that dropping order_train's rows/values is intended.
mutate(product_id = product_id.y,
add_to_cart_order = add_to_cart_order.y,
reordered = reordered.y,
# Recode day-of-week 0-6 to named days (0 = Saturday).
order_dow = factor(order_dow,
levels = c(0, 1, 2, 3, 4, 5, 6),
labels = c("Saturday", "Sunday", "Monday",
"Tuesday", "Wednesday", "Thursday", "Friday"))) %>%
select(-c(product_id.x, add_to_cart_order.x, reordered.x,
product_id.y, add_to_cart_order.y, reordered.y)) %>%
# Attach product name, then aisle and department names; drop the raw id keys.
inner_join(., products, by = "product_id") %>%
inner_join(., aisles, by = "aisle_id") %>%
inner_join(., depts, by = "department_id") %>%
select(-c(aisle_id, department_id, product_id))
head(full_order_data) # looks good
## order_id user_id eval_set order_number order_dow order_hour_of_day
## 1 2539329 1 prior 1 Monday 8
## 2 2539329 1 prior 1 Monday 8
## 3 2539329 1 prior 1 Monday 8
## 4 2539329 1 prior 1 Monday 8
## 5 2539329 1 prior 1 Monday 8
## 6 2398795 1 prior 2 Tuesday 7
## days_since_prior_order add_to_cart_order reordered
## 1 NA 1 0
## 2 NA 2 0
## 3 NA 3 0
## 4 NA 4 0
## 5 NA 5 0
## 6 15 1 1
## product_name aisle department
## 1 Soda soft drinks beverages
## 2 Organic Unsweetened Vanilla Almond Milk soy lactosefree dairy eggs
## 3 Original Beef Jerky popcorn jerky snacks
## 4 Aged White Cheddar Popcorn popcorn jerky snacks
## 5 XL Pick-A-Size Paper Towel Rolls paper goods household
## 6 Soda soft drinks beverages
# Remove extra objects from environment
rm(order_prior, order_train, orders)
# Now I want to create new variables
## First -- frequency each customer orders a given product
# How many times each user has purchased each product, most-purchased first.
product_count_per_user <- full_order_data %>%
  group_by(user_id, product_name) %>%
  tally(name = "product_count") %>%      # row count per (user, product) pair
  arrange(user_id, desc(product_count))
## `summarise()` has grouped output by 'user_id'. You can override using the
## `.groups` argument.
head(product_count_per_user)
## # A tibble: 6 × 3
## # Groups: user_id [1]
## user_id product_name product_count
## <int> <chr> <int>
## 1 1 Original Beef Jerky 10
## 2 1 Soda 10
## 3 1 Pistachios 9
## 4 1 Organic String Cheese 8
## 5 1 Cinnamon Toast Crunch 3
## 6 1 Zero Calorie Cola 3
## Average time of day & DOW customer places an order
avg_order_time <- full_order_data %>%
group_by(user_id) %>%
summarize(avg_hour = mean(order_hour_of_day),
avg_day = mean(as.numeric(order_dow)))
head(avg_order_time)
## # A tibble: 6 × 3
## user_id avg_hour avg_day
## <int> <dbl> <dbl>
## 1 1 10.5 3.64
## 2 2 10.4 3.01
## 3 3 16.4 2.01
## 4 4 13.1 5.72
## 5 5 15.7 2.62
## 6 6 17 4.86
#plot -- heatmap
# Heatmap of each user's average order hour vs. average order day.
# avg_day comes from as.numeric(order_dow), whose factor levels run
# Saturday = 1 through Friday = 7, so the axis needs breaks 1:7 with
# Saturday-first labels. (The original used breaks = 0:6 with Sunday-first
# labels -- off by one and inconsistent with the factor coding.)
ggplot(avg_order_time, aes(x = avg_hour, y = avg_day)) +
  geom_tile(aes(fill = after_stat(count)), stat = "bin2d") +
  labs(x = "Average Hour of Day", y = "Average Day of Week",
       title = "Average Order Time") +
  scale_y_continuous(breaks = 1:7,
                     labels = c("Saturday", "Sunday", "Monday", "Tuesday",
                                "Wednesday", "Thursday", "Friday")) +
  theme_Publication() +
  scale_fill_gradient(low = "blue", high = "red")
# What products are ordered more frequently based on DOW?
dow_freq_order <- full_order_data %>%
group_by(order_dow, product_name) %>%
summarize(product_count_day = n()) %>%
arrange(order_dow, desc(product_count_day))
## `summarise()` has grouped output by 'order_dow'. You can override using the
## `.groups` argument.
head(dow_freq_order)
## # A tibble: 6 × 3
## # Groups: order_dow [1]
## order_dow product_name product_count_day
## <fct> <chr> <int>
## 1 Saturday Banana 96769
## 2 Saturday Bag of Organic Bananas 71493
## 3 Saturday Organic Baby Spinach 54914
## 4 Saturday Organic Strawberries 53831
## 5 Saturday Organic Hass Avocado 43944
## 6 Saturday Organic Avocado 39846
# Top 3 products for each day of the week
# Top 3 products for each day of the week (ties kept, as with the old top_n()).
# scale_fill_Publication() was removed: fill is a fixed colour here, not a
# mapped aesthetic, so the scale had no effect.
dow_freq_order %>%
  group_by(order_dow) %>%
  slice_max(product_count_day, n = 3) %>%  # top_n() is superseded by slice_max()
  ggplot(., aes(x = reorder(product_name, -product_count_day), y = product_count_day)) +
  geom_col(fill = "skyblue") +             # geom_col == geom_bar(stat = "identity")
  labs(x = "Product Name", y = "Product Count",
       title = "Top 3 Products Ordered by Day of Week") +
  theme_clean() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  facet_wrap(~ order_dow, nrow = 2, scales = "free_x")
# Top 6 products for each day of the week (wider view than the top-3 plot).
# scale_fill_Publication() was removed: fill is a fixed colour here, not a
# mapped aesthetic, so the scale had no effect.
dow_freq_order %>%
  group_by(order_dow) %>%
  slice_max(product_count_day, n = 6) %>%  # top_n() is superseded by slice_max()
  ggplot(., aes(x = reorder(product_name, -product_count_day), y = product_count_day)) +
  geom_col(fill = "skyblue") +
  labs(x = "Product Name", y = "Product Count",
       title = "Top 6 Products Ordered by Day of Week") +  # was mislabelled "Top 3"
  theme_clean() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  facet_wrap(~ order_dow, nrow = 2, scales = "free_x")
# What products are ordered more frequently based on time of day?
hour_freq_order <- full_order_data %>%
group_by(order_hour_of_day, product_name) %>%
summarize(product_count_hour = n()) %>%
arrange(order_hour_of_day, desc(product_count_hour))
## `summarise()` has grouped output by 'order_hour_of_day'. You can override using
## the `.groups` argument.
head(hour_freq_order)
## # A tibble: 6 × 3
## # Groups: order_hour_of_day [1]
## order_hour_of_day product_name product_count_hour
## <int> <chr> <int>
## 1 0 Banana 2815
## 2 0 Bag of Organic Bananas 2712
## 3 0 Organic Strawberries 1839
## 4 0 Organic Baby Spinach 1768
## 5 0 Organic Hass Avocado 1398
## 6 0 Organic Avocado 1176
# Top 3 products for each hour of the day
hour_freq_order %>%
group_by(order_hour_of_day) %>%
top_n(3, product_count_hour) %>%
ggplot(., aes(x = reorder(product_name, -product_count_hour), y = product_count_hour)) +
geom_bar(stat = "identity", fill = "skyblue") +
labs(x = "Product Name", y = "Product Count",
title = "Top 3 Products Ordered by Hour of Day") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 8)) +
facet_wrap(~ order_hour_of_day, nrow = 2, scales = "free_x")
# Calculate order size
# Number of items in each order (one row per order_id).
order_size <- full_order_data %>%
  count(order_id, name = "order_size")
head(order_size)
## # A tibble: 6 × 2
## order_id order_size
## <int> <int>
## 1 2 9
## 2 3 8
## 3 4 13
## 4 5 26
## 5 6 3
## 6 7 2
#plot
# Aggregate order size at the customer level and calculate average order size
avg_order_size <- order_size %>%
inner_join(full_order_data, by = "order_id") %>%
group_by(user_id) %>%
summarize(avg_order_size = mean(order_size))
head(avg_order_size)
## # A tibble: 6 × 2
## user_id avg_order_size
## <int> <dbl>
## 1 1 6.25
## 2 2 16.1
## 3 3 7.89
## 4 4 4.56
## 5 5 10.0
## 6 6 5.29
# Which products are reordered the most?
# Total reorder events per product, most-reordered first.
# count(wt = reordered) sums the 0/1 reordered flag within each product,
# and sort = TRUE orders the result by descending total.
reorder_counts <- full_order_data %>%
  count(product_name, wt = reordered, name = "total_reorders", sort = TRUE)
head(reorder_counts)
## # A tibble: 6 × 2
## product_name total_reorders
## <chr> <int>
## 1 Banana 398609
## 2 Bag of Organic Bananas 315913
## 3 Organic Strawberries 205845
## 4 Organic Baby Spinach 186884
## 5 Organic Hass Avocado 170131
## 6 Organic Avocado 134044
# plot
reorder_counts %>%
slice(1:20) %>%
ggplot(., aes(x = reorder(product_name, -total_reorders), y = total_reorders,
fill = as.factor(product_name))) +
geom_bar(stat = "identity") +
labs(title = "Product Reorder Frequency",
x = "Name of Product",
y = "Total Reorders") +
theme_Publication() +
scale_colour_Publication() +
theme(legend.position = "none",
axis.text.x = element_text(angle = 45, hjust = 1))
# Product reorder ratio, per user
product_reorder_ratio <- full_order_data %>%
group_by(user_id, product_name) %>%
summarize(product_reorder_ratio = sum(reordered) / n()) %>%
arrange(user_id, desc(product_reorder_ratio))
## `summarise()` has grouped output by 'user_id'. You can override using the
## `.groups` argument.
product_reorder_ratio
## # A tibble: 13,307,953 × 3
## # Groups: user_id [206,209]
## user_id product_name product_reorder_ratio
## <int> <chr> <dbl>
## 1 1 Original Beef Jerky 0.9
## 2 1 Soda 0.9
## 3 1 Pistachios 0.889
## 4 1 Organic String Cheese 0.875
## 5 1 Cinnamon Toast Crunch 0.667
## 6 1 Zero Calorie Cola 0.667
## 7 1 Aged White Cheddar Popcorn 0.5
## 8 1 Bag of Organic Bananas 0.5
## 9 1 Organic Half & Half 0.5
## 10 1 XL Pick-A-Size Paper Towel Rolls 0.5
## # ℹ 13,307,943 more rows
# Distributions of all the variables. Try to replicate that cool graph of which products are linked
# Bonus: Distribution of 'reordered' outcome variable
full_order_data %>%
mutate(reordered = factor(reordered,
levels = c(0,1),
labels = c("No", "Yes"))) %>%
ggplot(., aes(reordered, fill = reordered)) +
geom_bar() + labs(x = "Reordered Product?",
title = "Distribution of Reordering Products") +
scale_x_discrete(labels = c("No", "Yes")) +
theme_Publication() +
scale_colour_Publication() +
theme(legend.position = "none" )
# What is the relationship between the order in which an item is added to the cart & reordering it?
# reordered is 0/1, so its mean at each cart position is the share of items
# at that position that were reorders.
reorder_rate_by_position <- full_order_data %>%
group_by(add_to_cart_order) %>%
summarize(reorder_rate = mean(reordered))
# plot
ggplot(reorder_rate_by_position, aes(x = add_to_cart_order, y = reorder_rate)) +
geom_line() +
geom_point() +
labs(x = "Cart Position", y = "Reorder Rate", title = "Reorder Rate by Cart Position") +
theme_Publication() +
# NOTE(review): no fill aesthetic is mapped in this plot, so
# scale_fill_Publication() has no effect and could be dropped.
scale_fill_Publication()
# Logarithmic relationship until cart has ~50 items -- curious to see if these are just some outlier cases
# All different order sizes -- ranges from 1:145 items in a single order
range(full_order_data$add_to_cart_order)
## [1] 1 145
# Filter observations for orders of over 50 items
full_order_data %>%
filter(add_to_cart_order > 50) %>%
group_by(order_id)
## # A tibble: 26,359 × 12
## # Groups: order_id [3,081]
## order_id user_id eval_set order_number order_dow order_hour_of_day
## <int> <int> <chr> <int> <fct> <int>
## 1 3306184 133 prior 17 Wednesday 3
## 2 3306184 133 prior 17 Wednesday 3
## 3 3306184 133 prior 17 Wednesday 3
## 4 3306184 133 prior 17 Wednesday 3
## 5 3306184 133 prior 17 Wednesday 3
## 6 3306184 133 prior 17 Wednesday 3
## 7 3306184 133 prior 17 Wednesday 3
## 8 3306184 133 prior 17 Wednesday 3
## 9 1730857 153 prior 9 Sunday 14
## 10 1730857 153 prior 9 Sunday 14
## # ℹ 26,349 more rows
## # ℹ 6 more variables: days_since_prior_order <dbl>, add_to_cart_order <int>,
## # reordered <int>, product_name <chr>, aisle <chr>, department <chr>
# Only 26,359 observations for 3,081 orders..
3081/32434489
## [1] 9.499148e-05
# That's only 0.0095% of our total orders -- so we only care about orders below this threshold
# Redefine variable to filter only orders with 50 or less items in cart
reorder_rate_by_position <- reorder_rate_by_position %>%
filter(add_to_cart_order <= 50)
# New plot -- looks much better. We now see a clear relationship b/w cart position & reordering
ggplot(reorder_rate_by_position, aes(x = add_to_cart_order, y = reorder_rate)) +
geom_line() +
geom_point() +
labs(x = "Cart Position", y = "Reorder Rate", title = "Reorder Rate by Cart Position") +
theme_Publication() +
scale_fill_Publication()
head(reorder_rate_by_position)
## # A tibble: 6 × 2
## add_to_cart_order reorder_rate
## <int> <dbl>
## 1 1 0.678
## 2 2 0.676
## 3 3 0.658
## 4 4 0.637
## 5 5 0.617
## 6 6 0.600
# How it looks before merging
head(full_order_data)
## order_id user_id eval_set order_number order_dow order_hour_of_day
## 1 2539329 1 prior 1 Monday 8
## 2 2539329 1 prior 1 Monday 8
## 3 2539329 1 prior 1 Monday 8
## 4 2539329 1 prior 1 Monday 8
## 5 2539329 1 prior 1 Monday 8
## 6 2398795 1 prior 2 Tuesday 7
## days_since_prior_order add_to_cart_order reordered
## 1 NA 1 0
## 2 NA 2 0
## 3 NA 3 0
## 4 NA 4 0
## 5 NA 5 0
## 6 15 1 1
## product_name aisle department
## 1 Soda soft drinks beverages
## 2 Organic Unsweetened Vanilla Almond Milk soy lactosefree dairy eggs
## 3 Original Beef Jerky popcorn jerky snacks
## 4 Aged White Cheddar Popcorn popcorn jerky snacks
## 5 XL Pick-A-Size Paper Towel Rolls paper goods household
## 6 Soda soft drinks beverages
# Attach all engineered features back onto the row-level data. Each right-hand
# table was built as one row per its join-key combination, so these joins
# should not fan out rows -- worth asserting with `relationship = "many-to-one"`
# (dplyr >= 1.1) if this is rerun.
full_order_data <- full_order_data %>%
# per-user per-product purchase count
left_join(., product_count_per_user, by = c("user_id", "product_name")) %>%
# per-user average order hour / day
left_join(., avg_order_time, by = "user_id") %>%
# per-(day, product) and per-(hour, product) order counts
left_join(., dow_freq_order, by = c("order_dow", "product_name")) %>%
left_join(., hour_freq_order, by = c("order_hour_of_day", "product_name")) %>%
# items per order, and per-user average order size
left_join(., order_size, by = "order_id") %>%
left_join(., avg_order_size, by = "user_id") %>%
# per-product total reorders, per-user per-product reorder ratio
left_join(., reorder_counts, by = "product_name") %>%
left_join(., product_reorder_ratio, by = c("user_id", "product_name")) %>%
# reorder rate by cart position; this table was filtered to positions <= 50,
# so rows with add_to_cart_order > 50 get NA reorder_rate (26,359 rows --
# see the NA count in summary() below).
left_join(., reorder_rate_by_position, by = "add_to_cart_order")
# New final data set -- looks good
head(full_order_data)
## order_id user_id eval_set order_number order_dow order_hour_of_day
## 1 2539329 1 prior 1 Monday 8
## 2 2539329 1 prior 1 Monday 8
## 3 2539329 1 prior 1 Monday 8
## 4 2539329 1 prior 1 Monday 8
## 5 2539329 1 prior 1 Monday 8
## 6 2398795 1 prior 2 Tuesday 7
## days_since_prior_order add_to_cart_order reordered
## 1 NA 1 0
## 2 NA 2 0
## 3 NA 3 0
## 4 NA 4 0
## 5 NA 5 0
## 6 15 1 1
## product_name aisle department
## 1 Soda soft drinks beverages
## 2 Organic Unsweetened Vanilla Almond Milk soy lactosefree dairy eggs
## 3 Original Beef Jerky popcorn jerky snacks
## 4 Aged White Cheddar Popcorn popcorn jerky snacks
## 5 XL Pick-A-Size Paper Towel Rolls paper goods household
## 6 Soda soft drinks beverages
## product_count avg_hour avg_day product_count_day product_count_hour
## 1 10 10.54237 3.644068 5953 2135
## 2 1 10.54237 3.644068 2019 885
## 3 10 10.54237 3.644068 1159 423
## 4 2 10.54237 3.644068 313 125
## 5 2 10.54237 3.644068 177 81
## 6 10 10.54237 3.644068 5499 687
## order_size avg_order_size total_reorders product_reorder_ratio reorder_rate
## 1 5 6.254237 27791 0.9 0.6775329
## 2 5 6.254237 12923 0.0 0.6762507
## 3 5 6.254237 4797 0.9 0.6580367
## 4 5 6.254237 1360 0.5 0.6369578
## 5 5 6.254237 536 0.5 0.6173831
## 6 6 6.254237 27791 0.9 0.6775329
# Remove extra objects from environment
rm(hour_freq_order, order_size, product_count_per_user, product_reorder_ratio,
reorder_counts, reorder_rate_by_position, avg_order_size, avg_order_time,
dow_freq_order, products, aisles, depts)
# Summary statistics for all variables in final data set
summary(full_order_data)
## order_id user_id eval_set order_number
## Min. : 2 Min. : 1 Length:32434489 Min. : 1.00
## 1st Qu.: 855943 1st Qu.: 51421 Class :character 1st Qu.: 5.00
## Median :1711048 Median :102611 Mode :character Median :11.00
## Mean :1710749 Mean :102937 Mean :17.14
## 3rd Qu.:2565514 3rd Qu.:154391 3rd Qu.:24.00
## Max. :3421083 Max. :206209 Max. :99.00
##
## order_dow order_hour_of_day days_since_prior_order add_to_cart_order
## Saturday :6209666 Min. : 0.00 Min. : 0.0 Min. : 1.000
## Sunday :5665856 1st Qu.:10.00 1st Qu.: 5.0 1st Qu.: 3.000
## Monday :4217798 Median :13.00 Median : 8.0 Median : 6.000
## Tuesday :3844117 Mean :13.42 Mean :11.1 Mean : 8.351
## Wednesday:3787215 3rd Qu.:16.00 3rd Qu.:15.0 3rd Qu.: 11.000
## Thursday :4209533 Max. :23.00 Max. :30.0 Max. :145.000
## Friday :4500304 NA's :2078068
## reordered product_name aisle department
## Min. :0.0000 Length:32434489 Length:32434489 Length:32434489
## 1st Qu.:0.0000 Class :character Class :character Class :character
## Median :1.0000 Mode :character Mode :character Mode :character
## Mean :0.5897
## 3rd Qu.:1.0000
## Max. :1.0000
##
## product_count avg_hour avg_day product_count_day
## Min. : 1.000 Min. : 0.00 Min. :1.000 Min. : 1
## 1st Qu.: 2.000 1st Qu.:12.20 1st Qu.:3.244 1st Qu.: 237
## Median : 4.000 Median :13.41 Median :3.754 Median : 981
## Mean : 7.621 Mean :13.42 Mean :3.739 Mean : 5411
## 3rd Qu.:10.000 3rd Qu.:14.62 3rd Qu.:4.226 3rd Qu.: 4206
## Max. :99.000 Max. :23.00 Max. :7.000 Max. :96769
##
## product_count_hour order_size avg_order_size total_reorders
## Min. : 1 Min. : 1.0 Min. : 1.00 Min. : 0
## 1st Qu.: 96 1st Qu.: 9.0 1st Qu.:10.50 1st Qu.: 832
## Median : 414 Median : 14.0 Median :14.55 Median : 3873
## Mean : 2399 Mean : 15.7 Mean :15.70 Mean : 26593
## 3rd Qu.: 1773 3rd Qu.: 21.0 3rd Qu.:19.63 3rd Qu.: 18541
## Max. :40731 Max. :145.0 Max. :78.55 Max. :398609
##
## product_reorder_ratio reorder_rate
## Min. :0.0000 Min. :0.397
## 1st Qu.:0.5000 1st Qu.:0.541
## Median :0.7500 Median :0.600
## Mean :0.5897 Mean :0.590
## 3rd Qu.:0.9000 3rd Qu.:0.658
## Max. :0.9899 Max. :0.678
## NA's :26359
# Class of each variable
full_order_data %>%
sapply(class)
## order_id user_id eval_set
## "integer" "integer" "character"
## order_number order_dow order_hour_of_day
## "integer" "factor" "integer"
## days_since_prior_order add_to_cart_order reordered
## "numeric" "integer" "integer"
## product_name aisle department
## "character" "character" "character"
## product_count avg_hour avg_day
## "integer" "numeric" "numeric"
## product_count_day product_count_hour order_size
## "integer" "integer" "integer"
## avg_order_size total_reorders product_reorder_ratio
## "numeric" "integer" "numeric"
## reorder_rate
## "numeric"
# Correlation Matrix of Variables
corr_matrix <- full_order_data %>%
mutate(order_dow_numeric = as.numeric(order_dow)) %>%
{cor(.[,-c(1, 2, 3, 5, 10, 11, 12)],
use = "pairwise.complete.obs")}
# Visualize Corr Matrix
corrplot(corr_matrix, method="shade", type="lower",
title="Instacart Correlation Matrix",
tl.cex = 0.75, tl.col = "black", tl.srt = 45,
number.cex = 0.5,
mar = c(0,0,1,0))
Mostly uncorrelated variables – the pairs that are correlated tend to be so because one was derived from the other (e.g. reordered / product_reorder_ratio). We do find that reorder_rate & add_to_cart_order have a negative relationship, as do reorder_rate & order_size. total_reorders has a positive correlation with product_count_day & product_count_hour.
For our models – we have a huge data set (32m observations). I tried running it with the full dataset initially, and it takes far too long as it is computationally very expensive on my MacBook. Instead, I decided to first subset a random sample of the data so it moves down from 32m obs -> 8m obs, 1/4 of the full data set. I understand that by doing this we lose a lot of our data, but the sample is still large enough that I don’t expect this to be a problem – our data is still very low-dimensional.
After subsetting the data, I then employed mini-batch training, which involves taking subsets of our shrunken data set, defined as “batches”, and applying regression models to each subset, iterating through the batches and updating the model at each iteration. We run this algorithm until we hit the total number of batches, which is the size of our training data set divided by the batch size (a number we set by hand – not too high, or else the model trains very slowly).
# data set is huge -- need to split it up randomly and rerun it -- use mini-batch training
# First -- create a subset of the original data set 1/4 of the size (32m -> 8m)
# Reproducible subsample: shrink 32m rows to a random 8m (1/4) so the models
# fit in reasonable time on a laptop.
set.seed(444)
full_order_data <- full_order_data[sample(nrow(full_order_data), 8000000), ]
# 80/20 train/test split. seq_len() instead of 1:nrow() -- identical draws
# here, but safe if the data frame were ever empty.
train_index <- sample(seq_len(nrow(full_order_data)), 0.8 * nrow(full_order_data))
train_data <- full_order_data[train_index, ]
test_data <- full_order_data[-train_index, ]
# Mini-batch parameters: ceiling() so the final partial batch is included.
batch_size <- 50000
num_batches <- ceiling(nrow(train_data) / batch_size)
# Replace NAs with 0. NAs occur in days_since_prior_order (a user's first
# order) and in reorder_rate (cart positions > 50 were filtered out above);
# 0 is a sensible fill for both.
# NOTE(review): the original used mutate_all(~ifelse(is.na(.), 0, .)), which is
# superseded and -- because ifelse() strips attributes -- silently converted
# the order_dow factor to its integer codes. Restricting the fill to numeric
# columns keeps factors and character columns intact.
train_data <- train_data %>%
  mutate(across(where(is.numeric), ~ coalesce(., 0)))
# Verify no NAs remain
sum(is.na(train_data)) # 0
## [1] 0
# Same NA fill on the test data: numeric columns only, so the order_dow factor
# and character columns are not mangled by ifelse()'s attribute-stripping.
test_data <- test_data %>%
  mutate(across(where(is.numeric), ~ coalesce(., 0)))
# Again, we check -- looks good
sum(is.na(test_data)) # 0
# Summary statistics for all variables in final data subset
summary(full_order_data)
## order_id user_id eval_set order_number
## Min. : 2 Min. : 1 Length:8000000 Min. : 1.00
## 1st Qu.: 855816 1st Qu.: 51344 Class :character 1st Qu.: 5.00
## Median :1711068 Median :102562 Mode :character Median :11.00
## Mean :1710795 Mean :102907 Mean :17.13
## 3rd Qu.:2565888 3rd Qu.:154381 3rd Qu.:24.00
## Max. :3421083 Max. :206209 Max. :99.00
##
## order_dow order_hour_of_day days_since_prior_order add_to_cart_order
## Saturday :1531940 Min. : 0.00 Min. : 0.0 Min. : 1.000
## Sunday :1399493 1st Qu.:10.00 1st Qu.: 5.0 1st Qu.: 3.000
## Monday :1039624 Median :13.00 Median : 8.0 Median : 6.000
## Tuesday : 947785 Mean :13.42 Mean :11.1 Mean : 8.353
## Wednesday: 933892 3rd Qu.:16.00 3rd Qu.:15.0 3rd Qu.: 11.000
## Thursday :1037663 Max. :23.00 Max. :30.0 Max. :143.000
## Friday :1109603 NA's :512792
## reordered product_name aisle department
## Min. :0.0000 Length:8000000 Length:8000000 Length:8000000
## 1st Qu.:0.0000 Class :character Class :character Class :character
## Median :1.0000 Mode :character Mode :character Mode :character
## Mean :0.5895
## 3rd Qu.:1.0000
## Max. :1.0000
##
## product_count avg_hour avg_day product_count_day
## Min. : 1.000 Min. : 0.00 Min. :1.000 Min. : 1
## 1st Qu.: 2.000 1st Qu.:12.20 1st Qu.:3.244 1st Qu.: 237
## Median : 4.000 Median :13.41 Median :3.754 Median : 982
## Mean : 7.624 Mean :13.42 Mean :3.739 Mean : 5423
## 3rd Qu.:10.000 3rd Qu.:14.62 3rd Qu.:4.226 3rd Qu.: 4216
## Max. :99.000 Max. :23.00 Max. :7.000 Max. :96769
##
## product_count_hour order_size avg_order_size total_reorders
## Min. : 1 Min. : 1.00 Min. : 1.00 Min. : 0
## 1st Qu.: 96 1st Qu.: 9.00 1st Qu.:10.50 1st Qu.: 832
## Median : 414 Median : 14.00 Median :14.56 Median : 3876
## Mean : 2403 Mean : 15.71 Mean :15.70 Mean : 26651
## 3rd Qu.: 1774 3rd Qu.: 21.00 3rd Qu.:19.63 3rd Qu.: 18541
## Max. :40731 Max. :145.00 Max. :78.55 Max. :398609
##
## product_reorder_ratio reorder_rate
## Min. :0.0000 Min. :0.397
## 1st Qu.:0.5000 1st Qu.:0.541
## Median :0.7500 Median :0.600
## Mean :0.5897 Mean :0.590
## 3rd Qu.:0.9000 3rd Qu.:0.658
## Max. :0.9899 Max. :0.678
## NA's :6512
# Mean & SD for every numeric variable in the final data set.
# NOTE(review): the original grew mean_values / sd_values with c() inside a
# loop, then labelled them via hand-maintained column indices
# (colnames(...)[-c(3, 5, 10:12)]) -- fragile, since the labels silently
# misalign if columns ever move. Deriving the names from the same is.numeric()
# test guarantees values and labels stay in sync. (is.numeric() is TRUE for
# both integer and double columns, so the old `|| is.integer()` was redundant.)
numeric_cols <- names(full_order_data)[vapply(full_order_data, is.numeric, logical(1))]
mean_values <- unname(vapply(full_order_data[numeric_cols], mean, numeric(1), na.rm = TRUE))
sd_values <- unname(vapply(full_order_data[numeric_cols], sd, numeric(1), na.rm = TRUE))
mean_sd_df <- data.frame("Variables" = numeric_cols,
                         "Mean" = mean_values,
                         "SD" = sd_values)
mean_sd_df
## Variables Mean SD
## 1 order_id 1.710795e+06 9.873501e+05
## 2 user_id 1.029068e+05 5.948021e+04
## 3 order_number 1.713174e+01 1.752758e+01
## 4 order_hour_of_day 1.342473e+01 4.247423e+00
## 5 days_since_prior_order 1.110160e+01 8.775010e+00
## 6 add_to_cart_order 8.352539e+00 7.128817e+00
## 7 reordered 5.895110e-01 4.919226e-01
## 8 product_count 7.623776e+00 9.816934e+00
## 9 avg_hour 1.342474e+01 1.876602e+00
## 10 avg_day 3.738628e+00 8.436293e-01
## 11 product_count_day 5.423167e+03 1.243822e+04
## 12 product_count_hour 2.402714e+03 5.641647e+03
## 13 order_size 1.570720e+01 9.537659e+00
## 14 avg_order_size 1.570366e+01 7.339443e+00
## 15 total_reorders 2.665121e+04 6.543087e+04
## 16 product_reorder_ratio 5.896708e-01 3.647764e-01
## 17 reorder_rate 5.898350e-01 6.937618e-02
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(class)
### Logit ###
#############
# Mini-batch training loop
# NOTE(review): each iteration fits a *fresh* glm on its 50k-row batch and
# overwrites insta_logit; no coefficients are carried between iterations, so
# after the loop only the model from the FINAL batch survives (the summary
# below shows 49,999 residual degrees of freedom, i.e. a single batch).
# This is repeated refitting, not incremental mini-batch training -- consider
# biglm::bigglm() or coefficient averaging across batches instead.
for (i in 1:num_batches) {
# Calculate start and end indices for the current batch
start_index <- ((i - 1) * batch_size) + 1
end_index <- min(i * batch_size, nrow(train_data))
# Read the current batch of training data
batch_data <- train_data[start_index:end_index, ]
# Logistic regression of reordered on all engineered features
insta_logit <- glm(reordered ~ order_number + order_dow + order_hour_of_day + days_since_prior_order +
add_to_cart_order + avg_hour + avg_day + product_count + product_count_day + product_count_hour +
order_size + avg_order_size + total_reorders + product_reorder_ratio + reorder_rate,
data = batch_data, family = "binomial")
}
# Summary statistics -- we find that order_number, order_dowTueday (relative to Saturday), days_since_prior_order, add_to_cart_order, avg_hour, order_size, avg_order_size, and product_reorder_rate are all statistically significant at all standard significance levels.
summary(insta_logit)
##
## Call:
## glm(formula = reordered ~ order_number + order_dow + order_hour_of_day +
## days_since_prior_order + add_to_cart_order + avg_hour + avg_day +
## product_count + product_count_day + product_count_hour +
## order_size + avg_order_size + total_reorders + product_reorder_ratio +
## reorder_rate, family = "binomial", data = batch_data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.117e+00 4.524e-01 -11.311 < 2e-16 ***
## order_number 5.868e-02 1.196e-03 49.076 < 2e-16 ***
## order_dow -5.960e-04 7.853e-03 -0.076 0.93950
## order_hour_of_day 2.097e-03 3.806e-03 0.551 0.58165
## days_since_prior_order 5.977e-02 1.740e-03 34.356 < 2e-16 ***
## add_to_cart_order -5.132e-02 6.854e-03 -7.487 7.03e-14 ***
## avg_hour 4.391e-03 8.571e-03 0.512 0.60844
## avg_day 6.749e-03 1.845e-02 0.366 0.71452
## product_count -7.485e-03 2.734e-03 -2.737 0.00620 **
## product_count_day 7.005e-08 4.876e-06 0.014 0.98854
## product_count_hour 1.173e-05 7.874e-06 1.490 0.13626
## order_size 1.311e-02 2.853e-03 4.594 4.34e-06 ***
## avg_order_size 8.929e-03 3.147e-03 2.838 0.00454 **
## total_reorders -7.482e-07 1.064e-06 -0.703 0.48195
## product_reorder_ratio 7.359e+00 8.529e-02 86.279 < 2e-16 ***
## reorder_rate -8.158e-01 6.487e-01 -1.258 0.20855
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 67613 on 49999 degrees of freedom
## Residual deviance: 31101 on 49984 degrees of freedom
## AIC: 31133
##
## Number of Fisher Scoring iterations: 6
## Misclassification error rate
# Predicted probability of reorder for each test observation
logit_preds <- predict(insta_logit, newdata = test_data, type = "response")
# Classify with a 0.5 probability threshold
logit_threshold <- ifelse(logit_preds > 0.5, "1", "0")
# Confusion matrix: rows = predicted, columns = actual
table(logit_threshold, test_data$reordered)
##
## logit_threshold 0 1
## 0 502815 77995
## 1 153862 865328
# Test accuracy (diagonal / total)
mean(logit_threshold == test_data$reordered, na.rm = TRUE) # logit model correctly predicts 85.51% of the time
## [1] 0.8550894
# I.e. misclassification error rate = 14.49%
logit_test_error <- 1 - mean(logit_threshold == test_data$reordered, na.rm = TRUE)
# ROC Curve & AUC Value
roc_curve_logit <- roc(test_data$reordered, logit_preds)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(roc_curve_logit, main = "ROC Curve", col = "blue")
auc_logit <- auc(roc_curve_logit)
print(paste("AUC:", auc_logit))
## [1] "AUC: 0.933023138690563"
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
### LDA & QDA ###
#################
set.seed(101)
# NOTE(review): as with the logit loop, each iteration refits LDA/QDA from
# scratch on its batch and overwrites insta_LDA / insta_QDA -- nothing is
# accumulated across batches, so only the models from the FINAL batch are
# evaluated below.
for (i in 1:num_batches) {
# Calculate start and end indices for the current batch
start_index <- ((i - 1) * batch_size) + 1
end_index <- min(i * batch_size, nrow(train_data))
# Read the current batch of training data
batch_data <- train_data[start_index:end_index, ]
# Linear discriminant analysis on the same feature set as the logit model
insta_LDA <- lda(reordered ~ order_number + order_dow + order_hour_of_day + days_since_prior_order +
add_to_cart_order + avg_hour + avg_day + product_count + product_count_day + product_count_hour +
order_size + avg_order_size + total_reorders + product_reorder_ratio + reorder_rate,
data = batch_data)
# Quadratic discriminant analysis (class-specific covariance matrices)
insta_QDA <- qda(reordered ~ order_number + order_dow + order_hour_of_day + days_since_prior_order +
add_to_cart_order + avg_hour + avg_day + product_count + product_count_day + product_count_hour +
order_size + avg_order_size + total_reorders + product_reorder_ratio + reorder_rate,
data = batch_data)
}
# Test-set predictions. predict() on lda/qda fits returns a list with $class
# (predicted label) and $posterior; the original `type = "response"` is not a
# predict.lda/predict.qda argument (it was silently ignored via ...), so it is
# dropped here.
lda.pred <- predict(insta_LDA, newdata = test_data)
qda.pred <- predict(insta_QDA, newdata = test_data)
# Plot of LDA model
plot(insta_LDA)
# Need to access column 'class' for LDA/QDA to create confusion matrix and calculate test error
table(lda.pred$class, test_data$reordered)
##
## 0 1
## 0 465639 50118
## 1 191038 893205
mean(lda.pred$class == test_data$reordered, na.rm = TRUE) # LDA correctly predicts 84.93% of time - misclassification error rate of 15.07%.
## [1] 0.8492775
lda_test_error <- 1 - mean(lda.pred$class == test_data$reordered, na.rm = TRUE)
table(qda.pred$class, test_data$reordered) # QDA correctly predicts 76.16% of the time - misclassification error rate = 23.84%.
##
## 0 1
## 0 594821 319621
## 1 61856 623702
mean(qda.pred$class == test_data$reordered, na.rm = TRUE)
## [1] 0.7615769
qda_test_error <- 1 - mean(qda.pred$class == test_data$reordered, na.rm = TRUE)
# AUC (LDA)
roc_curve_lda <- roc(test_data$reordered, as.numeric(lda.pred$class))
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(roc_curve_lda, main = "ROC Curve", col = "blue")
auc_lda <- auc(roc_curve_lda)
print(paste("AUC:", auc_lda))
## [1] "AUC: 0.827977280899065"
# AUC (QDA)
roc_curve_qda <- roc(test_data$reordered, as.numeric(qda.pred$class))
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(roc_curve_qda, main = "ROC Curve", col = "blue")
auc_qda <- auc(roc_curve_qda)
print(paste("AUC:", auc_qda))
## [1] "AUC: 0.78348998419337"
library(class)
set.seed(18)
# kNN
# Only use predictors found significant in the previous models, to reduce the
# feature space. NOTE(review): this vector is not referenced by the caret call
# below (the formula spells the predictors out) -- kept in case later code uses it.
predictors <- c("order_number", "order_dow", "days_since_prior_order",
                "add_to_cart_order", "avg_hour", "order_size", "avg_order_size", "product_reorder_ratio")
# Subset the training data: kNN cost grows with the number of stored points.
knn_subset_data <- train_data[sample(nrow(train_data), 300000), ]
# 80/20 split of the subset (seq_len() is safe for empty data, unlike 1:nrow()).
knn_train_index <- sample(seq_len(nrow(knn_subset_data)), 0.8 * nrow(knn_subset_data))
knn_train_data <- knn_subset_data[knn_train_index, ]
knn_test_data <- knn_subset_data[-knn_train_index, ]
# Batch parameters (experimented with different batch sizes)
knn_batch_size <- 2500
knn_num_batches <- ceiling(nrow(knn_train_data) / knn_batch_size)
## Find optimal K value for kNN using 5-fold CV
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
## The following object is masked from 'package:httr':
##
## progress
# Define range of k values
k_values <- c(5, 7, 9, 11) # Odd k values 5-11 (odd k avoids voting ties in binary classification).
# Define the number of folds for cross-validation
num_folds <- 5 # 5-fold cross-validation
# Define the control parameters for cross-validation
ctrl <- trainControl(method = "cv", # Use cross-validation
number = num_folds) # Number of folds
# Train kNN model using different values of k
knn_train <- train(as.factor(reordered) ~ order_number + order_dow + days_since_prior_order +
add_to_cart_order + avg_hour + order_size + avg_order_size +
product_reorder_ratio,
data = knn_train_data, # Training data
method = "knn", # kNN method
trControl = ctrl, # Cross-validation control
tuneGrid = data.frame(k = k_values)) # Grid of k values
# Print the results
print(knn_train) # best results from k = 1:5 -- 5 had highest accuracy (0.7162) -- then we tried 5, 7, 9, & 11 -- 11 had the highest
## k-Nearest Neighbors
##
## 240000 samples
## 8 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 192000, 192001, 192000, 191999, 192000
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 5 0.7178208 0.4015090
## 7 0.7247958 0.4140414
## 9 0.7280875 0.4195463
## 11 0.7315166 0.4257158
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 11.
# Plot performance metric vs. k values
plot(knn_train)
# bestTune is a one-row data.frame holding the winning k (11)
best_neighbors <- knn_train$bestTune
## Loop through batches
# NOTE(review): class::knn() fits and predicts in a single call, and
# knn_model is overwritten on every iteration -- after this loop only the
# LAST 2,500-row batch's predictions survive and are evaluated below. If the
# intent was to use all 240k training rows, the per-batch predictions would
# need to be combined (e.g. by majority vote). TODO confirm intent.
for (i in 1:knn_num_batches) {
# Get indices for the current batch
start_index <- (i - 1) * knn_batch_size + 1
end_index <- min(i * knn_batch_size, nrow(knn_train_data))
# Read the current batch of training data
knn_batch_data <- knn_train_data[start_index:end_index, ]
# Train kNN model on current batch
knn_model <- knn(train = knn_batch_data[, predictors],
test = knn_test_data[, predictors],
cl = knn_batch_data$reordered,
k = best_neighbors[1,1])
}
table(knn_model, knn_test_data$reordered) # KNN with K=11 confusion matrix
##
## knn_model 0 1
## 0 13271 6147
## 1 11388 29194
mean(knn_model==knn_test_data$reordered, na.rm = T) # Prediction accuracy of 70.78%.
## [1] 0.70775
knn_test_error <- 1-mean(knn_model==knn_test_data$reordered, na.rm = T)
# AUC Curve (built from hard class labels, so a single operating point)
roc_curve_knn <- roc(knn_test_data$reordered, as.numeric(knn_model))
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(roc_curve_knn, main = "ROC Curve", col = "blue")
auc_knn <- auc(roc_curve_knn)
print(paste("AUC:", auc_knn))
## [1] "AUC: 0.682123414096897"
library(tree)
### Classification Tree ###
###########################
set.seed(99)
# NOTE(review): insta_tree is overwritten on every pass, so the model kept
# after this loop is trained on the LAST batch only (batch_size/num_batches
# are defined earlier in the file). TODO confirm the batching is intentional.
for (i in 1:num_batches) {
# Calculate start and end indices for the current batch
start_index <- ((i - 1) * batch_size) + 1
end_index <- min(i * batch_size, nrow(train_data))
# Read the current batch of training data
batch_data <- train_data[start_index:end_index, ]
insta_tree <- tree(as.factor(reordered) ~ order_number + order_dow + order_hour_of_day +
days_since_prior_order + add_to_cart_order + avg_hour + avg_day +
product_count_day + product_count_hour + order_size + avg_order_size +
total_reorders + product_reorder_ratio + reorder_rate + product_count,
data = batch_data)
}
# predicting values in our test split using our subsetted model
tree.pred <- predict(insta_tree, newdata = test_data, type = "class")
# confusion matrix
table(tree.pred, test_data$reordered) # correctly predicts 87.74% of the time. Misclas error rate of 12.26%.
##
## tree.pred 0 1
## 0 488289 27711
## 1 168388 915612
mean(tree.pred==test_data$reordered)
## [1] 0.8774381
# Cross-Validation on our Classification Tree
cv.insta <- cv.tree(insta_tree, FUN = prune.misclass) # using cv to determine optimal level of tree complexity
cv.insta$size # tree sizes
## [1] 6 3 2 1
cv.insta$dev # tree size of 3 or 6 terminal nodes minimizes CV classification error
## [1] 6077 6077 8233 20401
cv.insta$k # alpha value of 0 -- no complexity penalty (unpruned tree minimizes our CV classification error)
## [1] -Inf 0 2156 12168
# Plotting CV Trees
par(mfrow = c(1, 2))
plot(cv.insta$size , cv.insta$dev, type = "b") # Plot CV Classification error against tree size
min_dev = which.min(cv.insta$dev); min_dev # CV Classification error minimized at the first value (cv.insta$dev[1])
## [1] 1
points(cv.insta$size[min_dev], cv.insta$dev[min_dev], col = "red", cex = 2, pch=20)
plot(cv.insta$k, cv.insta$dev, type = "b") # Plot CV Classification error against alpha values
points(0, cv.insta$dev[min_dev], col = "red", cex = 2, pch = 20)
# Pruning of our tree
prune.insta <- prune.misclass(insta_tree , best = 6)
plot(prune.insta); text(prune.insta , pretty = 0, cex = 0.5) # tree with 6 terminal nodes. We see the following vars: product_reorder_ratio & order_number
tree.pred2 <- predict(prune.insta, test_data, # predicting values in our test split using our subsetted model
type = "class")
table(tree.pred2, test_data$reordered) # same table as before -- accuracy of 87.74% (test error 12.26%)
##
## tree.pred2 0 1
## 0 488289 27711
## 1 168388 915612
mean(tree.pred2==test_data$reordered)
## [1] 0.8774381
tree_test_error <- 1-mean(tree.pred2==test_data$reordered)
# AUC Curve
roc_curve_tree <- roc(test_data$reordered, as.numeric(tree.pred2))
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(roc_curve_tree, main = "ROC Curve", col = "blue")
auc_tree <- auc(roc_curve_tree)
print(paste("AUC:", auc_tree))
## [1] "AUC: 0.85709983427411"
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
### Random Forests ###
######################
set.seed(38)
# Tried with a smaller batch size to save time -- took 20 mins at 1/5 of the size -- over an hour for full size
rf_batch_size <- 10000
# NOTE(review): the loop bound num_batches was computed earlier from
# batch_size, but the indices below use rf_batch_size = 10000; if the two
# sizes differ, the loop will not cover exactly the whole of train_data.
# Also, insta_rf is overwritten each pass, so the final forest is trained on
# the LAST 10k-row batch only. TODO confirm intent.
for (i in 1:num_batches) {
# Calculate start and end indices for the current batch
start_index <- ((i - 1) * rf_batch_size) + 1
end_index <- min(i * rf_batch_size, nrow(train_data))
# Read the current batch of training data
batch_data <- train_data[start_index:end_index, ]
insta_rf <- randomForest(as.factor(reordered) ~ order_number + order_dow + order_hour_of_day +
days_since_prior_order + add_to_cart_order + avg_hour + avg_day +
product_count_day + product_count_hour +order_size + avg_order_size +
total_reorders + product_reorder_ratio + reorder_rate + product_count,
data = batch_data)
}
# Variable importance: product_reorder_ratio and product_count dominate
importance(insta_rf)
## MeanDecreaseGini
## order_number 598.29762
## order_dow 72.31885
## order_hour_of_day 100.60654
## days_since_prior_order 307.51266
## add_to_cart_order 102.96149
## avg_hour 150.68469
## avg_day 159.95777
## product_count_day 157.84163
## product_count_hour 154.91041
## order_size 118.35267
## avg_order_size 158.89244
## total_reorders 169.56384
## product_reorder_ratio 1245.54101
## reorder_rate 98.23815
## product_count 1232.94912
varImpPlot(insta_rf)
rf_preds <- predict(insta_rf, newdata = test_data,
type = "class")
table(rf_preds, test_data$reordered)
##
## rf_preds 0 1
## 0 502670 35540
## 1 154007 907783
# Misclassification Error Rate (test error) of 11.85%. Correctly predicts 88.15% of the values.
mean(rf_preds==test_data$reordered)
## [1] 0.8815331
rf_test_error <- 1-mean(rf_preds==test_data$reordered)
# AUC Curve (hard class labels -> single operating point)
roc_curve_rf <- roc(test_data$reordered, as.numeric(rf_preds))
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(roc_curve_rf, main = "ROC Curve", col = "blue")
auc_rf <- auc(roc_curve_rf)
print(paste("AUC:", auc_rf))
## [1] "AUC: 0.863899969868076"
### Boosting ###
################
# Need to make a subset of our data to run on xgboost
set.seed(88)
# Create subset & indexes: 62,500 rows, split 80/20 (50k train / 12.5k test)
xgboost_subset_data <- train_data[sample(nrow(train_data), 62500),]
xgboost_train_index <- sample(1:nrow(xgboost_subset_data), 0.8*nrow(xgboost_subset_data))
xgboost_train_data <- xgboost_subset_data[xgboost_train_index,]
xgboost_test_data <- xgboost_subset_data[-xgboost_train_index,]
# Train data subset: 15 predictors plus the factor response `reordered`,
# which is deliberately placed LAST (column 16) so it can be dropped with
# [, -16] below. glimpse() prints the structure and returns its input, so
# the piped assignment still works.
xgboost_train_data <- xgboost_train_data %>%
mutate(reordered = as.factor(reordered)) %>%
dplyr::select(order_number, order_dow, order_hour_of_day, days_since_prior_order,
add_to_cart_order, avg_hour, avg_day, product_count, product_count_day,
product_count_hour, order_size, avg_order_size, total_reorders,
product_reorder_ratio, reorder_rate, reordered) %>%
glimpse
## Rows: 50,000
## Columns: 16
## $ order_number <int> 2, 7, 3, 1, 27, 73, 18, 8, 2, 9, 6, 32, 52, 33,…
## $ order_dow <int> 4, 2, 1, 2, 2, 6, 4, 2, 3, 6, 1, 2, 1, 5, 5, 7,…
## $ order_hour_of_day <int> 9, 17, 15, 13, 18, 13, 22, 9, 6, 15, 7, 8, 13, …
## $ days_since_prior_order <dbl> 5, 25, 0, 0, 5, 2, 8, 30, 14, 7, 8, 22, 9, 14, …
## $ add_to_cart_order <int> 1, 13, 1, 10, 8, 4, 2, 9, 4, 7, 11, 21, 18, 15,…
## $ avg_hour <dbl> 11.36000, 15.95575, 15.77778, 13.88732, 14.2329…
## $ avg_day <dbl> 5.720000, 4.371681, 4.203704, 3.690141, 3.75000…
## $ product_count <int> 1, 1, 2, 1, 11, 1, 8, 1, 3, 6, 1, 35, 1, 20, 2,…
## $ product_count_day <int> 3425, 2007, 110, 1086, 51, 753, 109, 481, 667, …
## $ product_count_hour <int> 2315, 752, 58, 566, 20, 418, 11, 213, 41, 839, …
## $ order_size <int> 8, 14, 8, 27, 12, 14, 16, 18, 15, 13, 13, 23, 2…
## $ avg_order_size <dbl> 8.360000, 11.548673, 8.740741, 19.760563, 11.08…
## $ total_reorders <int> 22713, 7995, 382, 2581, 158, 2793, 318, 1086, 2…
## $ product_reorder_ratio <dbl> 0.0000000, 0.0000000, 0.5000000, 0.0000000, 0.9…
## $ reorder_rate <dbl> 0.6775329, 0.5247756, 0.6775329, 0.5510178, 0.5…
## $ reordered <fct> 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0,…
# Convert into numeric matrix to put into boosting function
# (drop the response column 16; apply(..., as.numeric) guarantees an
# all-numeric matrix for xgboost)
xgboost_train_data_x <- as.matrix(xgboost_train_data[, -16])
xgboost_train_data_x <- apply(xgboost_train_data_x, 2, as.numeric)
# Test data subset: identical column selection/order as the training subset
xgboost_test_data <- xgboost_test_data %>%
mutate(reordered = as.factor(reordered)) %>%
dplyr::select(order_number, order_dow, order_hour_of_day, days_since_prior_order,
add_to_cart_order, avg_hour, avg_day, product_count, product_count_day,
product_count_hour, order_size, avg_order_size, total_reorders,
product_reorder_ratio, reorder_rate, reordered) %>%
glimpse
## Rows: 12,500
## Columns: 16
## $ order_number <int> 22, 26, 12, 27, 3, 2, 14, 12, 74, 16, 23, 22, 1…
## $ order_dow <int> 4, 2, 7, 1, 4, 4, 2, 2, 6, 4, 3, 1, 2, 1, 4, 4,…
## $ order_hour_of_day <int> 17, 12, 16, 10, 12, 20, 14, 9, 16, 12, 14, 17, …
## $ days_since_prior_order <dbl> 5, 7, 7, 8, 30, 30, 7, 7, 2, 6, 4, 7, 10, 11, 1…
## $ add_to_cart_order <int> 1, 4, 2, 5, 3, 21, 6, 13, 4, 13, 11, 24, 5, 7, …
## $ avg_hour <dbl> 9.205263, 13.799686, 11.829787, 13.146269, 13.7…
## $ avg_day <dbl> 5.473684, 3.102121, 5.179331, 3.040299, 3.78082…
## $ product_count <int> 34, 1, 10, 30, 2, 1, 2, 3, 1, 13, 9, 3, 11, 8, …
## $ product_count_day <int> 2858, 398, 1095, 8211, 350, 23, 1159, 859, 3283…
## $ product_count_hour <int> 1613, 237, 620, 4563, 240, 4, 622, 403, 2170, 4…
## $ order_size <int> 5, 6, 21, 24, 7, 33, 16, 22, 6, 30, 13, 30, 5, …
## $ avg_order_size <dbl> 5.031579, 21.928515, 25.054711, 20.650746, 8.31…
## $ total_reorders <int> 15925, 613, 5638, 38544, 1457, 51, 3656, 3544, …
## $ product_reorder_ratio <dbl> 0.9705882, 0.0000000, 0.9000000, 0.9666667, 0.5…
## $ reorder_rate <dbl> 0.6775329, 0.6369578, 0.6762507, 0.6173831, 0.6…
## $ reordered <fct> 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0,…
# Convert into numeric matrix for test error rate calculation at end
xgboost_test_data_x <- as.matrix(xgboost_test_data[, -16])
xgboost_test_data_x <- apply(xgboost_test_data_x, 2, as.numeric)
library(caret)
library(xgboost)
##
## Attaching package: 'xgboost'
## The following object is masked from 'package:dplyr':
##
## slice
# Tune parameters: 3 x 3 x 2 = 18 candidate combinations; the remaining
# xgboost hyperparameters are held at fixed values.
grid_tune <- expand.grid(
nrounds = c(500, 1000, 1500), # number of trees
max_depth = c(2, 4, 6),
eta = c(0.1, 0.3), # could try c(0.025, 0.05, 0.1, 0.3) # learning rate
gamma = 0,
colsample_bytree = 1,
min_child_weight = 1,
subsample = 1
)
# Cross-validation
train_control <- trainControl(method = "cv",
number = 5,
verboseIter = TRUE,
allowParallel = TRUE)
# Grid search over all 18 combinations with 5-fold CV. Column 16 of
# xgboost_train_data is the factor response `reordered` (last column of the
# select() above); the 15 predictors come from the numeric matrix.
xgb_tune <- train(x = xgboost_train_data_x,
y = xgboost_train_data[, 16],
trControl = train_control,
tuneGrid = grid_tune,
method = "xgbTree",
verbose = TRUE)
## + Fold1: eta=0.1, max_depth=2, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500
## [07:40:29] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [07:40:29] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## - Fold1: eta=0.1, max_depth=2, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500
## + Fold1: eta=0.1, max_depth=4, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500
## [07:44:02] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [07:44:02] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## - Fold1: eta=0.1, max_depth=4, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500
## + Fold1: eta=0.1, max_depth=6, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500
## [07:50:29] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [07:50:30] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## - Fold1: eta=0.1, max_depth=6, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500
## + Fold1: eta=0.3, max_depth=2, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500
## [07:52:17] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [07:52:17] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## - Fold1: eta=0.3, max_depth=2, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500
## + Fold1: eta=0.3, max_depth=4, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500
## [07:55:26] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [07:55:26] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## - Fold1: eta=0.3, max_depth=4, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500
## + Fold1: eta=0.3, max_depth=6, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500
## [08:00:50] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [08:00:51] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## - Fold1: eta=0.3, max_depth=6, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500
## + Fold2: eta=0.1, max_depth=2, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500
## [08:02:54] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [08:02:54] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## - Fold2: eta=0.1, max_depth=2, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500
## + Fold2: eta=0.1, max_depth=4, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500
## [08:06:10] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [08:06:10] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## - Fold2: eta=0.1, max_depth=4, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500
## + Fold2: eta=0.1, max_depth=6, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500
## [08:10:35] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [08:10:36] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## - Fold2: eta=0.1, max_depth=6, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500
## + Fold2: eta=0.3, max_depth=2, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500
## [08:12:07] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [08:12:07] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## - Fold2: eta=0.3, max_depth=2, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500
## + Fold2: eta=0.3, max_depth=4, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500
## [08:15:12] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [08:15:13] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## - Fold2: eta=0.3, max_depth=4, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500
## + Fold2: eta=0.3, max_depth=6, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500
## [08:25:39] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [08:25:40] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## - Fold2: eta=0.3, max_depth=6, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500
## + Fold3: eta=0.1, max_depth=2, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500
## [08:28:02] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [08:28:03] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## - Fold3: eta=0.1, max_depth=2, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500
## + Fold3: eta=0.1, max_depth=4, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500
## [08:31:15] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [08:31:15] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## - Fold3: eta=0.1, max_depth=4, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500
## + Fold3: eta=0.1, max_depth=6, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500
## [08:36:21] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [08:36:21] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## - Fold3: eta=0.1, max_depth=6, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500
## + Fold3: eta=0.3, max_depth=2, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500
## [08:37:55] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [08:37:55] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## - Fold3: eta=0.3, max_depth=2, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500
## + Fold3: eta=0.3, max_depth=4, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500
## [08:40:51] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [08:40:51] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## - Fold3: eta=0.3, max_depth=4, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500
## + Fold3: eta=0.3, max_depth=6, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500
## [09:03:10] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [09:03:10] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## - Fold3: eta=0.3, max_depth=6, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500
## + Fold4: eta=0.1, max_depth=2, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500
## [09:04:41] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [09:04:41] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## - Fold4: eta=0.1, max_depth=2, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500
## + Fold4: eta=0.1, max_depth=4, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500
## [09:07:33] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [09:07:33] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## - Fold4: eta=0.1, max_depth=4, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500
## + Fold4: eta=0.1, max_depth=6, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500
## [09:11:55] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [09:11:55] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## - Fold4: eta=0.1, max_depth=6, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500
## + Fold4: eta=0.3, max_depth=2, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500
## [09:14:49] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [09:14:50] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## - Fold4: eta=0.3, max_depth=2, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500
## + Fold4: eta=0.3, max_depth=4, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500
## [09:18:24] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [09:18:24] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## - Fold4: eta=0.3, max_depth=4, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500
## + Fold4: eta=0.3, max_depth=6, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500
## [09:20:21] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [09:20:21] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## - Fold4: eta=0.3, max_depth=6, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500
## + Fold5: eta=0.1, max_depth=2, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500
## [09:20:58] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [09:20:58] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## - Fold5: eta=0.1, max_depth=2, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500
## + Fold5: eta=0.1, max_depth=4, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500
## [09:22:10] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [09:22:10] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## - Fold5: eta=0.1, max_depth=4, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500
## + Fold5: eta=0.1, max_depth=6, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500
## [09:23:54] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [09:23:55] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## - Fold5: eta=0.1, max_depth=6, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500
## + Fold5: eta=0.3, max_depth=2, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500
## [09:24:29] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [09:24:29] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## - Fold5: eta=0.3, max_depth=2, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500
## + Fold5: eta=0.3, max_depth=4, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500
## [09:25:35] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [09:25:35] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## - Fold5: eta=0.3, max_depth=4, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500
## + Fold5: eta=0.3, max_depth=6, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500
## [09:27:18] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [09:27:18] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## - Fold5: eta=0.3, max_depth=6, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500
## Aggregating results
## Selecting tuning parameters
## Fitting nrounds = 500, max_depth = 2, eta = 0.1, gamma = 0, colsample_bytree = 1, min_child_weight = 1, subsample = 1 on full training set
# Inspect the CV results across the tuning grid
# (best: nrounds = 500, max_depth = 2, eta = 0.1, accuracy ~0.886)
xgb_tune
## eXtreme Gradient Boosting
##
## 50000 samples
## 15 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 40000, 40001, 39999, 40000, 40000
## Resampling results across tuning parameters:
##
## eta max_depth nrounds Accuracy Kappa
## 0.1 2 500 0.8858598 0.7588373
## 0.1 2 1000 0.8849398 0.7572350
## 0.1 2 1500 0.8834598 0.7543262
## 0.1 4 500 0.8833399 0.7540533
## 0.1 4 1000 0.8803399 0.7480462
## 0.1 4 1500 0.8789999 0.7455985
## 0.1 6 500 0.8811599 0.7497944
## 0.1 6 1000 0.8778799 0.7432704
## 0.1 6 1500 0.8771799 0.7419598
## 0.3 2 500 0.8832398 0.7539887
## 0.3 2 1000 0.8810798 0.7498776
## 0.3 2 1500 0.8786198 0.7449194
## 0.3 4 500 0.8761000 0.7398405
## 0.3 4 1000 0.8726400 0.7330412
## 0.3 4 1500 0.8718599 0.7314719
## 0.3 6 500 0.8748399 0.7375824
## 0.3 6 1000 0.8743799 0.7366667
## 0.3 6 1500 0.8729199 0.7338152
##
## Tuning parameter 'gamma' was held constant at a value of 0
## Tuning
##
## Tuning parameter 'min_child_weight' was held constant at a value of 1
##
## Tuning parameter 'subsample' was held constant at a value of 1
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were nrounds = 500, max_depth = 2, eta
## = 0.1, gamma = 0, colsample_bytree = 1, min_child_weight = 1 and subsample = 1.
best_params <- xgb_tune$bestTune # nrounds 500, depth 2, eta 0.1
# Final model -- a one-row grid pinning the winning hyperparameters
final_grid <- expand.grid(
nrounds = best_params$nrounds,
max_depth = best_params$max_depth,
eta = best_params$eta,
gamma = best_params$gamma,
colsample_bytree = best_params$colsample_bytree,
min_child_weight = best_params$min_child_weight,
subsample = best_params$subsample
)
# Refit with the best hyperparameters (5-fold CV again via train_control);
# caret then fits the returned model on the full 50k-row training subset.
xgb_final <- train(x = xgboost_train_data_x,
y = xgboost_train_data[, 16],
trControl = train_control,
tuneGrid = final_grid,
method = "xgbTree",
verbose = TRUE)
## + Fold1: nrounds=500, max_depth=2, eta=0.1, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1
## - Fold1: nrounds=500, max_depth=2, eta=0.1, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1
## + Fold2: nrounds=500, max_depth=2, eta=0.1, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1
## - Fold2: nrounds=500, max_depth=2, eta=0.1, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1
## + Fold3: nrounds=500, max_depth=2, eta=0.1, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1
## - Fold3: nrounds=500, max_depth=2, eta=0.1, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1
## + Fold4: nrounds=500, max_depth=2, eta=0.1, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1
## - Fold4: nrounds=500, max_depth=2, eta=0.1, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1
## + Fold5: nrounds=500, max_depth=2, eta=0.1, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1
## - Fold5: nrounds=500, max_depth=2, eta=0.1, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1
## Aggregating results
## Fitting final model on full training set
xgb_final
## eXtreme Gradient Boosting
##
## 50000 samples
## 15 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 40001, 40000, 40000, 40000, 39999
## Resampling results:
##
## Accuracy Kappa
## 0.88618 0.759444
##
## Tuning parameter 'nrounds' was held constant at a value of 500
## Tuning
## held constant at a value of 1
## Tuning parameter 'subsample' was held
## constant at a value of 1
# Misclassification error rate of 11.56%.
# type = "raw" returns hard class labels (caret's default for classification)
boost_preds <- predict(xgb_final, newdata = xgboost_test_data_x, type = "raw")
table(boost_preds, xgboost_test_data$reordered)
##
## boost_preds 0 1
## 0 3923 317
## 1 1128 7132
mean(boost_preds==xgboost_test_data$reordered) # Correctly predicts 88.44% of the time.
## [1] 0.8844
boost_test_error <- 1-mean(boost_preds==xgboost_test_data$reordered)
# Evaluate model: ROC/AUC from hard class labels (single operating point)
library(pROC)
roc_curve_boost <- roc(xgboost_test_data$reordered, as.numeric(boost_preds))
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(roc_curve_boost, main = "ROC Curve", col = "blue")
auc_boost <- auc(roc_curve_boost)
print(paste("AUC:", auc_boost)) # AUC: 86.71
## [1] "AUC: 0.867060918887782"
## Plots of best tuning hyperparameters
tuning_results <- data.frame(xgb_tune$results)
# Heatmap of CV accuracy over the (max_depth, eta) grid.
# Fix: removed the `z` aesthetic -- geom_tile() has no z aesthetic and
# labs(z = ...) labels an aesthetic that does not exist, so both were dead
# code; fill = Accuracy is what actually drives the colouring.
ggplot(tuning_results, aes(x = max_depth, y = eta)) +
  geom_tile(aes(fill = Accuracy)) + # Color represents accuracy
  scale_fill_viridis_c() +
  labs(title = "XGBoost Tuning - Accuracy Heatmap",
       x = "Max Depth",
       y = "Learning Rate (eta)") +
  theme_Publication() +
  theme(legend.key.width = unit(1.5, "cm"))
# Line chart: accuracy vs number of trees, one line per learning rate,
# faceted by max_depth.
# Fix: eta is numeric, and a continuous colour aesthetic provides no
# grouping, so geom_line() would draw one zig-zag line through both eta
# values; factor(eta) gives a discrete colour per eta and separate lines.
ggplot(tuning_results, aes(x = nrounds, y = Accuracy, color = factor(eta))) +
  geom_line() +
  facet_wrap(~ max_depth) + # Facet by max_depth
  labs(title = "XGBoost Tuning - Accuracy by Parameters",
       x = "Number of Trees (nrounds)",
       y = "Accuracy",
       color = "Learning Rate (eta)") +
  theme_Publication() +
  theme(legend.key.width = unit(1, "cm"))
set.seed(55)
# Create a subset of data to use -- SVM scales poorly, so only 10k rows
svm_subset_data <- train_data[sample(nrow(train_data), 10000),]
# 80/20 train/test split within the subset (8k train / 2k test)
svm_train_index <- sample(1:nrow(svm_subset_data), 0.8*nrow(svm_subset_data))
svm_train_data <- svm_subset_data[svm_train_index,]
svm_test_data <- svm_subset_data[-svm_train_index,]
library(e1071)
# Find best hyperparameters for SVM model
tune.out <- tune(svm,
reordered ~ order_number + order_dow + order_hour_of_day + days_since_prior_order +
add_to_cart_order + avg_hour + avg_day + product_count + product_count_day +
product_count_hour + order_size + avg_order_size + total_reorders +
product_reorder_ratio + reorder_rate,
data = svm_train_data,
kernel = "radial",
ranges = list(
cost = c(1, 10),
gamma = c(0.5, 1)
)
)
tune.out
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost gamma
## 1 0.5
##
## - best performance: 0.1191082
# Best CV hyperparameters and the full grid-search results.
final_hypers <- tune.out$best.parameters
results <- as.data.frame(tune.out$performances)
# Plotting cost vs. gamma with performance as color
# FIX: the previous call passed `labels = round(1 - results$error, 2)` (one
# label per grid row) to scale_fill_gradient(), which errors with
# "`breaks` and `labels` have different lengths" because ggplot2 picks its
# own breaks. Mapping accuracy (1 - error) to fill directly yields correct
# default breaks and labels, so the legend really shows accuracy.
ggplot(results, aes(x = cost, y = gamma, fill = 1 - error)) +
  geom_tile() +
  scale_fill_gradient(low = "yellow", high = "red") +
  labs(title = "Hyperparameter Tuning Results",
       x = "Cost",
       y = "Gamma",
       fill = "Accuracy") +
  theme_Publication() +
  theme(legend.key.width = unit(1, "cm"))
# Support Vector Machines
# Fit the final SVM with the CV-selected cost/gamma. Because `reordered` is
# numeric, this is eps-regression (see "SVM-Type: eps-regression" below):
# predictions are continuous scores, thresholded at 0.5 further down.
svmfit <- svm(reordered ~ order_number + order_dow + order_hour_of_day + days_since_prior_order +
add_to_cart_order + avg_hour + avg_day + product_count + product_count_day +
product_count_hour + order_size + avg_order_size + total_reorders + product_reorder_ratio + reorder_rate,
data = svm_train_data,
kernel = "radial",
gamma = final_hypers$gamma,
cost = final_hypers$cost)
summary(svmfit)
##
## Call:
## svm(formula = reordered ~ order_number + order_dow + order_hour_of_day +
## days_since_prior_order + add_to_cart_order + avg_hour + avg_day +
## product_count + product_count_day + product_count_hour + order_size +
## avg_order_size + total_reorders + product_reorder_ratio + reorder_rate,
## data = svm_train_data, kernel = "radial", gamma = final_hypers$gamma,
## cost = final_hypers$cost)
##
##
## Parameters:
## SVM-Type: eps-regression
## SVM-Kernel: radial
## cost: 1
## gamma: 0.5
## epsilon: 0.1
##
##
## Number of Support Vectors: 6461
# Continuous regression scores on the held-out 20%.
svm_preds <- predict(svmfit, newdata = svm_test_data)
# Threshold the continuous scores at 0.5 to obtain 0/1 class labels.
svm_class_preds <- ifelse(svm_preds > 0.5, 1, 0)
table(svm_class_preds, svm_test_data$reordered) # Correctly predicts 83.35% of the time ((563 + 1104) / 2000).
##
## svm_class_preds 0 1
## 0 563 57
## 1 276 1104
mean(svm_class_preds==svm_test_data$reordered) # Accuracy 83.35%; misclassification error rate of 16.65%.
## [1] 0.8335
svm_test_error <- 1-mean(svm_class_preds==svm_test_data$reordered)
# Plot AUC
# The continuous scores (not the thresholded labels) feed roc(), so unlike
# the boosting ROC above this one is a proper threshold-free curve.
roc_curve_svm <- roc(svm_test_data$reordered, svm_preds)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(roc_curve_svm, main = "ROC Curve", col = "blue")
auc_svm <- auc(roc_curve_svm)
print(paste("AUC:", auc_svm))
## [1] "AUC: 0.889341624241976"
# Collect the held-out misclassification rates of all eight models.
final_test_errors <- c(logit_test_error, lda_test_error, qda_test_error,
                       knn_test_error, tree_test_error, rf_test_error,
                       boost_test_error, svm_test_error)
# FIX: the "Classification Tree" entry previously contained a hard line break
# plus the continuation line's leading spaces inside the string literal,
# garbling the axis label (and the Model column of auc_data built from this
# vector later on).
model_names <- c("Logit", "LDA", "QDA", "kNN", "Classification Tree",
                 "Random Forest", "Boosting", "SVM")
# Plot
# Bar chart of test error per model, with the rate printed above each bar.
# NOTE(review): scale_colour_Publication() styles the colour aesthetic, but
# the bars use fill, so it has no visible effect here -- kept for consistency
# with the file's other plots.
ggplot(data.frame(model_names, final_test_errors),
       aes(model_names, final_test_errors, fill = as.factor(model_names))) +
  geom_col() +
  labs(title = "Test Error Comparison",
       x = "Model Names",
       y = "Misclassification Rate") +
  theme_Publication() +
  theme(legend.position = "none",
        axis.text.x = element_text(size = 9)) +
  scale_colour_Publication() +
  geom_text(aes(label = round(final_test_errors, 3)), vjust = -0.5, size = 3)
# Combine ROC curve data into one dataset
# FIX: pROC roc objects store specificities; the false positive rate is
# 1 - specificity. The previous code plotted raw specificities under an
# "FPR" label, mirroring every curve left-to-right and putting the curves on
# the wrong side of the chance diagonal.
roc_data <- rbind(
  data.frame(Model = "Logit", FPR = 1 - roc_curve_logit$specificities, TPR = roc_curve_logit$sensitivities),
  data.frame(Model = "LDA", FPR = 1 - roc_curve_lda$specificities, TPR = roc_curve_lda$sensitivities),
  data.frame(Model = "QDA", FPR = 1 - roc_curve_qda$specificities, TPR = roc_curve_qda$sensitivities),
  data.frame(Model = "kNN", FPR = 1 - roc_curve_knn$specificities, TPR = roc_curve_knn$sensitivities),
  data.frame(Model = "Tree", FPR = 1 - roc_curve_tree$specificities, TPR = roc_curve_tree$sensitivities),
  data.frame(Model = "Random Forest", FPR = 1 - roc_curve_rf$specificities, TPR = roc_curve_rf$sensitivities),
  data.frame(Model = "Boosting", FPR = 1 - roc_curve_boost$specificities, TPR = roc_curve_boost$sensitivities),
  data.frame(Model = "SVM", FPR = 1 - roc_curve_svm$specificities, TPR = roc_curve_svm$sensitivities))
# Plot ROC curves for all models
ggplot(roc_data, aes(x = FPR, y = TPR, color = Model)) +
  geom_line() +
  geom_abline(intercept = 0, slope = 1, linetype = "dashed") + # Chance-level reference line
  labs(title = "ROC Curves for Different Models",
       x = "False Positive Rate",
       y = "True Positive Rate",
       color = "Model") +
  theme_minimal()
# Combine AUC data into one dataset
# One row per model: its name and the AUC computed on the held-out data.
held_out_aucs <- c(auc_logit, auc_lda, auc_qda, auc_knn,
                   auc_tree, auc_rf, auc_boost, auc_svm)
auc_data <- data.frame(Model = model_names, AUC = held_out_aucs)
# Plot AUC Scores for all models
# Bar chart of AUC per model; geom_col() is geom_bar(stat = "identity").
# Each bar is annotated with its AUC rounded to three decimals.
ggplot(auc_data, aes(Model, AUC, fill = Model)) +
  geom_col() +
  labs(title = "AUC Comparison",
       x = "Model Names",
       y = "AUC") +
  theme_Publication() +
  theme(legend.position = "none",
        axis.text.x = element_text(size = 9)) +
  scale_colour_Publication() +
  geom_text(aes(label = round(AUC, 3)), vjust = -0.5, size = 3)